{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import xgboost as xgb\n",
    "import sys,random\n",
    "import pickle\n",
    "import os\n",
    "import numpy as np"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# import modin.pandas as pd\n",
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "if not os.path.exists('featurescore'):\n",
    "    os.mkdir('featurescore')\n",
    "if not os.path.exists('model'):\n",
    "    os.mkdir('model')\n",
    "if not os.path.exists('preds'):\n",
    "    os.mkdir('preds')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 读取有标签的"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 1min 5s, sys: 4.9 s, total: 1min 10s\n",
      "Wall time: 1min 17s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "# 导入训练数据集\n",
    "data_date = pd.read_csv('../../preprocess_data/train_x_date.csv').drop(columns=['id','loan_hour'])\n",
    "data_raw = pd.read_csv('../../preprocess_data_new/train_ax_nodup.csv',nrows=33465).drop(columns=['id','loan_dt','tag'])\n",
    "data_null = pd.read_csv('../../preprocess_data_new/train_ax_row_null.csv',nrows=33465).drop(columns=['id'])\n",
    "data_tag = pd.read_csv('../../preprocess_data/train_x_33465.csv',usecols=['tag'])\n",
    "\n",
    "data = pd.concat([data_date,data_raw,data_null,data_tag],axis=1)\n",
    "data_label = pd.read_csv('../../preprocess_data/train_y_33465.csv',usecols=['label'])\n",
    "\n",
    "x = data.fillna(-1).values\n",
    "y = data_label.values.ravel()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "使用旧数据集 train集的维度为：  \n",
    "使用新数据集 train集的维度为：6702"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 读取无标签的"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 2min 16s, sys: 20.7 s, total: 2min 36s\n",
      "Wall time: 2min 47s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "unlabel_date = pd.read_csv('../../preprocess_data/unlabel_x_date.csv').drop(columns=['id','loan_hour'])\n",
    "unlabel_raw = pd.read_csv('../../preprocess_data_new/train_ax_nodup.csv',skiprows=range(1,33466)).drop(columns=['id','loan_dt','tag'])\n",
    "unlabel_null = pd.read_csv('../../preprocess_data_new/train_ax_row_null.csv',skiprows=range(1,33466)).drop(columns=['id'])\n",
    "unlabel_tag = pd.read_csv('../../preprocess_data/unlabel_x_66535.csv',usecols=['tag'])\n",
    "unlabel = pd.concat([unlabel_date,unlabel_raw,unlabel_null,unlabel_tag],axis=1)\n",
    "x_test = unlabel.fillna(-1).values\n",
    "label = pd.read_csv('../predict_unlabel/preds/unlabel_pred.csv',usecols=['prob'])\n",
    "label.columns = ['label']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1.0    3327\n",
       "0.0    3327\n",
       "Name: label, dtype: int64"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "low,high = np.percentile(label.label,5),np.percentile(label.label,95)\n",
    "myfilter = (label.label<low)|(label.label>high)\n",
    "\n",
    "unlabel_f = unlabel[myfilter]\n",
    "label_f = label[myfilter].copy()\n",
    "\n",
    "label_f[label_f<low] = 0\n",
    "label_f[label_f>high] = 1\n",
    "\n",
    "label_f.label.value_counts()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 合并数据"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "data_com = pd.concat([data,unlabel_f],copy=False,axis=0,ignore_index=True)\n",
    "label_com = pd.concat([data_label,label_f],copy=False,axis=0,ignore_index=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "x_train,y_train= data_com.fillna(-1).values, label_com.values.ravel()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 定义学习器"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "def SelectModel(model_name):\n",
    "    if model_name == 'GBC':\n",
    "        from sklearn.ensemble import GradientBoostingClassifier\n",
    "        model = GradientBoostingClassifier(loss='deviance',\n",
    "                                           learning_rate =0.1,\n",
    "                                           n_estimators=300,\n",
    "                                           subsample=0.9,\n",
    "                                           max_depth=3,\n",
    "#                                            verbose=1,\n",
    "                                          random_state=2018)\n",
    "    elif model_name == 'XGB':\n",
    "        from xgboost import XGBClassifier\n",
    "\n",
    "        model = XGBClassifier(max_depth=6,\n",
    "                              learning_rate =0.04, \n",
    "                              booster='gbtree',\n",
    "                              objective='binary:logistic',\n",
    "                              early_stopping_rounds=100,\n",
    "                              scale_pos_weight=float(len(y_train)-np.sum(y_train))/float(np.sum(y_train)),\n",
    "                              eval_metric='auc',\n",
    "                              gamma=1,\n",
    "                              reg_lambda=1,\n",
    "                              subsample=0.9,\n",
    "                              min_child_weight=1,\n",
    "                              seed=2018,\n",
    "                              silent=False,\n",
    "                              n_jobs=24,\n",
    "                              num_boost_round =300\n",
    "                             )\n",
    "    elif model_name == 'RFC':\n",
    "        from sklearn.ensemble import RandomForestClassifier\n",
    "        model = RandomForestClassifier(n_estimators=1500,\n",
    "                                       n_jobs =36,\n",
    "                                       max_features='sqrt',\n",
    "                                       class_weight='balanced',\n",
    "#                                        verbose =1,\n",
    "                                       random_state=2018)\n",
    "    elif model_name == 'LGB':\n",
    "        from lightgbm import LGBMClassifier\n",
    "        model = LGBMClassifier(boost='gbdt',\n",
    "                    num_leaves=135, \n",
    "                    scale_pos_weight=float(len(y_train)-np.sum(y_train.ravel()))/float(np.sum(y_train.ravel())),\n",
    "                    max_depth=-1,\n",
    "                    learning_rate=.05,\n",
    "                    max_bin=200,\n",
    "                    min_data_in_leaf= 60,\n",
    "                    objective='binary',\n",
    "                    metric='auc',\n",
    "                    num_threads=32,\n",
    "                    slient=False,\n",
    "                    num_boost_round =300)\n",
    "    else:\n",
    "        pass\n",
    "    return model"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 训练"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "XGB\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "import joblib\n",
    "model_list = ['XGB','RFC','LGB']\n",
    "unlabel_pred_list = []\n",
    "path = './model_5_95'\n",
    "if not os.path.exists(path):\n",
    "    os.mkdir(path)\n",
    "\n",
    "for model in model_list:\n",
    "    print(model)\n",
    "    clf = SelectModel(model)\n",
    "    clf.fit(x_train,y_train)\n",
    "    joblib.dump(clf,os.path.join(path,model))  "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 预测"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 55,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 31.9 s, sys: 1.29 s, total: 33.2 s\n",
      "Wall time: 33.2 s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "valid_date = pd.read_csv('../../preprocess_data/valid_date.csv').drop(columns=['id','loan_hour'])\n",
    "# valid_raw = pd.read_csv('../../preprocess_data/valid.csv').drop(columns=['id','loan_dt'])\n",
    "valid_raw = pd.read_csv('../../preprocess_data_new/valid_nodup.csv').drop(columns=['id','loan_dt'])\n",
    "valid_null = pd.read_csv('../../preprocess_data_new/valid_row_null.csv').drop(columns=['id'])\n",
    "valid_tag = pd.read_csv('../predict_tag/tag.csv',usecols=['tag'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 56,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 1.32 s, sys: 4 ms, total: 1.33 s\n",
      "Wall time: 1.32 s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "valid = pd.concat([valid_date,valid_raw,valid_null,valid_tag],axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 57,
   "metadata": {},
   "outputs": [],
   "source": [
    "test_id = pd.read_csv('../../preprocess_data_new/valid_date.csv',usecols=['id']).values"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 116,
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "dtest = xgb.DMatrix(valid.fillna(-1).values, feature_names=list(valid.columns))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "raw 去除nan列 + 统计null AUC: 0.82751113123698  \n",
    "nodup + null + tag AUC: 0.82803008109167  \n",
    "nodup + null + tag（rank融合）AUC:0.8279914450872  \n",
    "nodup + null + tag (fillna(-1)) AUC:0.82979480823375"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(33465, 6702)"
      ]
     },
     "execution_count": 27,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
